RQ: A partire dalla stratificazione del dataset per categorie demografiche, è possibile individuare cluster in relazione ai fenomeni osservati?
library(dplyr)
dataset_path <- "u_s_chronic_disease_indicators_cdi.csv"
df <- read.csv(dataset_path, na.strings = c("", "NA", "NULL", "null"))
df_filtered <- df %>%
filter(topic %in% c("Diabetes", "Cardiovascular Disease"))
df_filtered %>%
filter(stratificationcategory1 == "Race/Ethnicity") %>%
count(stratification1) %>%
arrange(desc(n))
NA
library(dplyr)
qid_map <- df_filtered %>%
distinct(questionid, question) %>%
arrange(questionid)
# Mostra la tabella
knitr::kable(qid_map, col.names = c("QuestionID", "Question"))
| QuestionID | Question |
|---|---|
| CVD10_1 | Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with a history of coronary heart disease |
| CVD10_2 | Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with a history of coronary heart disease |
| CVD1_1 | Mortality from total cardiovascular disease |
| CVD1_2 | Mortality from diseases of the heart |
| CVD1_3 | Mortality from coronary heart disease |
| CVD1_4 | Mortality from heart failure |
| CVD1_5 | Mortality from cerebrovascular disease (stroke) |
| CVD2_0 | Hospitalization for heart failure among Medicare-eligible persons aged >= 65 years |
| CVD3_1 | Hospitalization for stroke |
| CVD3_2 | Hospitalization for acute myocardial infarction |
| CVD4_0 | Cholesterol screening among adults aged >= 18 years |
| CVD5_0 | High cholesterol prevalence among adults aged >= 18 years |
| CVD6_1 | Awareness of high blood pressure among adults aged >= 18 years |
| CVD6_2 | Awareness of high blood pressure among women aged 18-44 years |
| CVD7_0 | Taking medicine for high blood pressure control among adults aged >= 18 years with high blood pressure |
| CVD8_0 | Pre-pregnancy hypertension |
| CVD9_1 | Influenza vaccination among noninstitutionalized adults aged 18-64 years with a history of coronary heart disease or stroke |
| CVD9_2 | Influenza vaccination among noninstitutionalized adults aged >= 65 years with a history of coronary heart disease or stroke |
| DIA10_0 | Adults with diagnosed diabetes aged >= 18 years who have taken a diabetes self-management course |
| DIA11_1 | Prevalence of high cholesterol among adults aged >= 18 years with diagnosed diabetes |
| DIA11_2 | Prevalence of high blood pressure among adults aged >= 18 years with diagnosed diabetes |
| DIA11_3 | Prevalence of depressive disorders among adults aged >= 18 years with diagnosed diabetes |
| DIA12_1 | Influenza vaccination among noninstitutionalized adults aged 18-64 years with diagnosed diabetes |
| DIA12_2 | Influenza vaccination among noninstitutionalized adults aged >= 65 years with diagnosed diabetes |
| DIA13_1 | Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with diagnosed diabetes |
| DIA13_2 | Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with diagnosed diabetes |
| DIA1_1 | Mortality due to diabetes reported as any listed cause of death |
| DIA1_2 | Mortality with diabetic ketoacidosis reported as any listed cause of death |
| DIA2_1 | Prevalence of diagnosed diabetes among adults aged >= 18 years |
| DIA2_2 | Diabetes prevalence among women aged 18-44 years |
| DIA3_1 | Prevalence of pre-pregnancy diabetes |
| DIA3_2 | Prevalence of gestational diabetes |
| DIA4_0 | Amputation of a lower extremity attributable to diabetes |
| DIA5_0 | Foot examination among adults aged >= 18 years with diagnosed diabetes |
| DIA6_0 | Glycosylated hemoglobin measurement among adults aged >= 18 years with diagnosed diabetes |
| DIA7_0 | Dilated eye examination among adults aged >= 18 years with diagnosed diabetes |
| DIA8_0 | Visits to dentist or dental clinic among adults aged >= 18 years with diagnosed diabetes |
| DIA9_0 | Hospitalization with diabetes as a listed diagnosis |
NA
library(dplyr)
# conversione in numerico di datavalue (da eventuale stringa) e gestione virgole (da , a .)
df_k <- df_filtered %>%
mutate(
datavalue_num = suppressWarnings(as.numeric(gsub(",", ".", datavalue, fixed = TRUE))),
.row = dplyr::row_number()
)
library(dplyr) # gestione dati
library(purrr) # funzioni di mappatura
set.seed(9) # seme per riproducibilitÃ
clustered <- df_k %>%
group_by(topic, questionid, datavaluetypeid) %>% # raggruppa per variabili chiave
group_modify(~{ # lavora su ogni gruppo
d <- .x # sottoinsieme del gruppo
d_non_na <- d %>% filter(!is.na(datavalue_num)) # tiene solo valori non NA
if (nrow(d_non_na) < 7) { # se meno di 7 righe
d$cluster <- NA_integer_ # assegna NA al cluster
d # restituisce i dati così
} else {
km <- kmeans(d_non_na$datavalue_num, # kmeans su 7 cluster
centers = 7, nstart = 25)
d_non_na$cluster <- km$cluster # assegna etichette cluster
d %>% left_join(d_non_na %>% # riunisce cluster con dati
select(.row, cluster),
by = ".row")
}
}) %>%
ungroup() # toglie il raggruppamento
clustered %>%
select(topic, questionid, datavaluetypeid, # mostra colonne chiave
datavalue_num, cluster) %>%
head() # prime righe
NA
NA
library(dplyr)
library(tidyr)
# calcolo delle percentuali per ogni cluster e stratification
cluster_strat_pct <- clustered %>%
# filtra solo i record con categoria race/ethnicity ed esclude cluster mancanti
filter(stratificationcategory1 == "Race/Ethnicity", !is.na(cluster)) %>%
# raggruppa per topic, questionid, datavaluetypeid e cluster
group_by(topic, questionid, datavaluetypeid, cluster) %>%
# calcola la dimensione totale del cluster
mutate(cluster_total = n()) %>%
# raggruppa ulteriormente per stratification1
group_by(topic, questionid, datavaluetypeid, cluster, stratification1) %>%
# calcola il numero di elementi per ogni combinazione
summarise(
n = n(),
cluster_total = dplyr::first(cluster_total),
.groups = "drop_last"
) %>%
# calcola la percentuale di ciascuna categoria sul totale del cluster
mutate(pct = 100 * n / cluster_total) %>%
# rimuove i raggruppamenti
ungroup() %>%
# ordina i risultati
arrange(topic, questionid, datavaluetypeid, cluster, desc(pct))
# anteprima delle prime 20 righe della tabella finale
cluster_strat_pct %>% head(20)
NA
library(dplyr) # gestione dati
library(tidyr) # funzioni di reshaping
cluster_strat_pct <- clustered %>%
filter(stratificationcategory1 == "Race/Ethnicity", !is.na(cluster)) %>% # tiene solo race/ethnicity e cluster validi
group_by(topic, questionid, datavaluetypeid, cluster) %>% # raggruppa per cluster e variabili base
mutate(cluster_total = n()) %>% # totale righe per cluster
group_by(topic, questionid, datavaluetypeid, cluster, stratification1) %>% # aggiunge stratificazione
summarise(
n = n(), # conta righe per gruppo
cluster_total = dplyr::first(cluster_total), # riporta totale cluster
.groups = "drop_last"
) %>%
mutate(pct = 100 * n / cluster_total) %>% # percentuale sul cluster
ungroup() %>% # toglie il raggruppamento
arrange(topic, questionid, datavaluetypeid, cluster, desc(pct)) # ordina in modo leggibile
cluster_strat_pct %>% head(20) # anteprima prime 20 righe
NA
# crea una tabella con le combinazioni uniche di topic, questionid e datavaluetypeid ordinate
combos <- cluster_strat_pct %>%
dplyr::distinct(topic, questionid, datavaluetypeid) %>%
dplyr::arrange(topic, questionid, datavaluetypeid)
# ciclo per generare un grafico per ciascuna combinazione
for (i in seq_len(nrow(combos))) {
# estrae i valori della combinazione corrente
tp <- combos$topic[i]
qid <- combos$questionid[i]
dvt <- combos$datavaluetypeid[i]
# genera il grafico a torta per la combinazione corrente
p <- plot_pies_for_combo(tp, qid, dvt)
# stampa il grafico
print(p)
}